{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from peewee import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "database = MySQLDatabase('wgs', **{'charset': 'utf8', 'sql_mode': 'PIPES_AS_CONCAT', 'use_unicode': True, 'host': 'localhost', 'port': 3306, 'user': 'root', 'password': '179353'})\n",
    "\n",
    "class BaseModel(Model):\n",
    "    class Meta:\n",
    "        database = database\n",
    "\n",
    "class QcBeforeFiltering(BaseModel):\n",
    "    batch=CharField()\n",
    "    Samples = CharField()\n",
    "    Raw_total_reads=BigIntegerField()\n",
    "    Raw_total_bases=BigIntegerField()\n",
    "    Raw_q20_bases=BigIntegerField()\n",
    "    Raw_q30_bases=BigIntegerField()\n",
    "    Raw_q20_rate=FloatField()\n",
    "    Raw_q30_rate=FloatField()\n",
    "    Raw_read1_mean_length=IntegerField()\n",
    "    Raw_read2_mean_length=IntegerField()\n",
    "    Raw_gc_content=FloatField()\n",
    "\n",
    "    class Meta:\n",
    "        table_name = 'before_filtering'\n",
    "\n",
    "\n",
    "class QcAfterFiltering(BaseModel):\n",
    "    batch=CharField()\n",
    "    Samples = CharField()\n",
    "    Clean_total_reads=BigIntegerField()\n",
    "    Clean_total_bases=BigIntegerField()\n",
    "    Clean_q20_bases=BigIntegerField()\n",
    "    Clean_q30_bases=BigIntegerField()\n",
    "    Clean_q20_rate=FloatField()\n",
    "    Clean_q30_rate=FloatField()\n",
    "    Clean_read1_mean_length=IntegerField()\n",
    "    Clean_read2_mean_length=IntegerField()\n",
    "    Clean_gc_content=FloatField()\n",
    "\n",
    "    class Meta:\n",
    "        table_name = 'after_filtering'\n",
    "\n",
    "\n",
    "\n",
    "class QcFilteringResult(BaseModel):\n",
    "    batch=CharField()\n",
    "    Samples = CharField()\n",
    "    passed_filter_reads=BigIntegerField()\n",
    "    low_quality_reads=BigIntegerField()\n",
    "    too_many_N_reads=BigIntegerField()\n",
    "    too_short_reads=BigIntegerField()\n",
    "    too_long_reads=BigIntegerField()\n",
    "    class Meta:\n",
    "        table_name = 'filtering_result'\n",
    "        \n",
    "class Admixtools(BaseModel):\n",
    "    batch = CharField()\n",
    "    A = CharField()\n",
    "    B = CharField()\n",
    "    X = CharField()\n",
    "    C = CharField()\n",
    "    O = CharField()\n",
    "    alpha = FloatField()\n",
    "    stderr = FloatField()\n",
    "    Zscore = FloatField()\n",
    "\n",
    "    class Meta:\n",
    "        table_name = 'qpf4ratio'\n",
    "\n",
    "def create_table(table):\n",
    "    u\"\"\"\n",
    "    如果table不存在，新建table\n",
    "    \"\"\"\n",
    "    if not table.table_exists():\n",
    "        table.create_table()\n",
    "\n",
    "def drop_table(table):\n",
    "    u\"\"\"\n",
    "    table 存在，就删除\n",
    "    \"\"\"\n",
    "    if table.table_exists():\n",
    "        table.drop_table()\n",
    "        \n",
    "class SampleInfo(BaseModel):\n",
    "    batch = CharField()\n",
    "    fq1 = CharField()\n",
    "    fq2 = CharField()\n",
    "    adapter = CharField()\n",
    "    library = CharField()\n",
    "    sample = CharField()\n",
    "    class Meta:\n",
    "        table_name = 'sample_info'\n",
    "        \n",
    "class KilogenomePop(BaseModel):\n",
    "    continent = CharField()\n",
    "    country = CharField()\n",
    "    population = CharField()\n",
    "\n",
    "    class Meta:\n",
    "        table_name = 'kilogenome_pop'\n",
    "        \n",
    "        \n",
    "class Admixture(BaseModel):\n",
    "    batch = CharField()\n",
    "    Samples = CharField()\n",
    "    GBR = FloatField()\n",
    "    FIN = FloatField()\n",
    "    CHS = FloatField()\n",
    "    PUR = FloatField()\n",
    "    CDX = FloatField()\n",
    "    CLM = FloatField()\n",
    "    IBS = FloatField()\n",
    "    PEL = FloatField()\n",
    "    PJL = FloatField()\n",
    "    KHV = FloatField()\n",
    "    ACB = FloatField()\n",
    "    GWD = FloatField()\n",
    "    ESN = FloatField()\n",
    "    BEB = FloatField()\n",
    "    MSL = FloatField()\n",
    "    STU = FloatField()\n",
    "    ITU = FloatField()\n",
    "    CEU = FloatField()\n",
    "    YRI = FloatField()\n",
    "    CHB = FloatField()\n",
    "    JPT = FloatField()\n",
    "    LWK = FloatField()\n",
    "    ASW = FloatField()\n",
    "    MXL = FloatField()\n",
    "    TSI = FloatField()\n",
    "    GIH = FloatField()\n",
    "\n",
    "    class Meta:\n",
    "        table_name = 'Admixture_stat'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 306,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "create_table(SampleInfo)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "drop_table(QcBeforeFiltering)\n",
    "drop_table(QcAfterFiltering)\n",
    "drop_table(QcFilterResult)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 301,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "\n",
    "database = MySQLDatabase('wgs', **{'charset': 'utf8', 'sql_mode': 'PIPES_AS_CONCAT', 'use_unicode': True, 'host': 'localhost', 'port': 3306, 'user': 'root', 'password': '179353'})\n",
    "\n",
    "\n",
    "\n",
    "class Base(object):\n",
    "\n",
    "\n",
    "    def __init__(self,batch):\n",
    "        self.batch=batch\n",
    "        self.context=self.batch.split('_')\n",
    "        self.username=self.context[-1]\n",
    "        self.analysis=self.context[0]\n",
    "        self.project='_'.join(self.context[1:-1])\n",
    "        self.save_dir=os.path.join('/Users/linlian/Documents/GitHub/Keygene/Keygene/media/file',self.username)\n",
    "        self.save_dir=os.path.join(self.save_dir,self.project)\n",
    "        self.save_dir=os.path.join(self.save_dir,self.analysis)\n",
    "\n",
    "\n",
    "\n",
    "    def create_json_list(self,df,table):\n",
    "        df['batch']=self.batch\n",
    "        test=json.loads(df.to_json(orient='records'))\n",
    "        create_table(table)\n",
    "        with database.atomic():\n",
    "            for i in range(0, len(test), 100):\n",
    "                table.insert_many(test[i:i+100]).execute()\n",
    "\n",
    "\n",
    "\n",
    "class Qc(Base):\n",
    "    def __init__(self,batch):\n",
    "        super(Qc,self).__init__(batch)\n",
    "        self.after_filtering_sql()\n",
    "        self.before_filtering_sql()\n",
    "        self.filtering_result_sql()\n",
    "\n",
    "\n",
    "    def after_filtering_sql(self):\n",
    "        df=pd.read_csv(os.path.join(self.save_dir,'qc/after_filtering.csv'))\n",
    "        self.create_json_list(df,QcAfterFiltering)\n",
    "        \n",
    "\n",
    "    def before_filtering_sql(self):\n",
    "        df=pd.read_csv(os.path.join(self.save_dir,'qc/before_filtering.csv'))\n",
    "        self.create_json_list(df,QcBeforeFiltering)\n",
    "\n",
    "    def filtering_result_sql(self):\n",
    "        df=pd.read_csv(os.path.join(self.save_dir,'qc/filtering_result.csv'))\n",
    "        self.create_json_list(df,QcFilteringResult)\n",
    "        \n",
    "        \n",
    "class InputInfo(Base):\n",
    "    def __init__(self,batch):\n",
    "        super(InputInfo,self).__init__(batch)\n",
    "\n",
    "    def info(self):\n",
    "        data=[]\n",
    "        with open(os.path.join(self.save_dir,'input.txt'),'rt') as f:\n",
    "            for k,v in enumerate(f):\n",
    "                if k == 0:\n",
    "                    pass\n",
    "                else:\n",
    "                    line=v.split()\n",
    "                    fq1=line[0]\n",
    "                    fq2=line[1]\n",
    "                    adapter=line[2]\n",
    "                    library=line[3]\n",
    "                    sample=line[4]\n",
    "                    data.append({'batch':self.batch,'fq1':fq1,'fq2':fq2,'adapter':adapter,'library':library,'sample':sample})\n",
    "        self.insert_db(SampleInfo,data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "kg = {'东亚': ['CHB', 'JPT', 'CHS', 'KHV', 'CDX'], '南亚': ['ITU', 'BEB', 'STU', 'GIH', 'PJL'], '欧洲': [\n",
    "        'FIN', 'CEU', 'TSI', 'GBR', 'IBS'], '美洲': ['PEL', 'MXL', 'CLM', 'PUR'], \n",
    "        '非洲': ['ASW', 'ACB', 'GWD', 'ESN', 'LWK', 'MSL', 'YRI']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch='demo_demo_lianlin_lianlin'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "table=Admixture.select().where(Admixture.batch==batch)\n",
    "kg=KilogenomePop.select()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df=pd.DataFrame.from_dict(table.dicts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "kilogenome=pd.DataFrame.from_dict(kg.dicts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "population_country=dict(zip(kilogenome['population'],kilogenome['country']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2=df[df['Samples'] == 'P002190812016']\n",
    "df2.set_index('Samples',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "'>=' not supported between instances of 'KilogenomePop' and 'int'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-24-4428b47dc738>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mpop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkg\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0meas\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m     \u001b[0meas\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0meas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0meas_sub\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdefaultdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.7/site-packages/peewee.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m   1939\u001b[0m             \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1940\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1941\u001b[0;31m             \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1942\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cursor_wrapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfill_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1943\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cursor_wrapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrow_cache\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: '>=' not supported between instances of 'KilogenomePop' and 'int'"
     ]
    }
   ],
   "source": [
    "pop=[]\n",
    "for p in kg:\n",
    "    eas=df2[kg[p]]\n",
    "    eas=eas.reset_index()\n",
    "    eas_sub=defaultdict(list)\n",
    "    eas_sub=df_tuple(eas_sub,eas)\n",
    "    eas_sub=eas_sub[sample]\n",
    "    eas_sub=sorted(eas_sub,key=lambda x:x[1],reverse=True)\n",
    "    eas_sub=list(map(lambda x:[x[0],format(x[1], '.4%')],eas_sub))\n",
    "    pop.append((p,eas_sub))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CHB</th>\n",
       "      <th>JPT</th>\n",
       "      <th>CHS</th>\n",
       "      <th>KHV</th>\n",
       "      <th>CDX</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Samples</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>P002190812016</th>\n",
       "      <td>0.495569</td>\n",
       "      <td>0.052568</td>\n",
       "      <td>0.400288</td>\n",
       "      <td>0.00001</td>\n",
       "      <td>0.051354</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    CHB       JPT       CHS      KHV       CDX\n",
       "Samples                                                       \n",
       "P002190812016  0.495569  0.052568  0.400288  0.00001  0.051354"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2[kg['东亚']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CHB', 'JPT', 'CHS', 'KHV', 'CDX']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kg['东亚']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'东亚': ['CHB', 'JPT', 'CHS', 'KHV', 'CDX'],\n",
       " '南亚': ['ITU', 'BEB', 'STU', 'GIH', 'PJL'],\n",
       " '欧洲': ['FIN', 'CEU', 'TSI', 'GBR', 'IBS'],\n",
       " '美洲': ['PEL', 'MXL', 'CLM', 'PUR'],\n",
       " '非洲': ['ASW', 'ACB', 'GWD', 'ESN', 'LWK', 'MSL', 'YRI']}"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
